College Distance Data Import

College Distance Dataset

# Import the College Distance dataset from the local CSV folder.
# The first column of the file has no header, so it is reported below as
# being renamed to `...1`.
collegeDistance <- read_csv("~/CSVs/CollegeDistance.csv")
## New names:
## * `` -> ...1
## Rows: 4739 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): gender, ethnicity, fcollege, mcollege, home, urban, income, region
## dbl (7): ...1, score, unemp, wage, distance, tuition, education
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows to confirm the import and the parsed column types.
head(collegeDistance)
## # A tibble: 6 × 15
##    ...1 gender ethnicity score fcollege mcollege home  urban unemp  wage
##   <dbl> <chr>  <chr>     <dbl> <chr>    <chr>    <chr> <chr> <dbl> <dbl>
## 1     1 male   other      39.2 yes      no       yes   yes    6.20  8.09
## 2     2 female other      48.9 no       no       yes   yes    6.20  8.09
## 3     3 male   other      48.7 no       no       yes   yes    6.20  8.09
## 4     4 male   afam       40.4 no       no       yes   yes    6.20  8.09
## 5     5 female other      40.5 no       no       no    yes    5.60  8.09
## 6     6 male   other      54.7 no       no       yes   yes    5.60  8.09
## # … with 5 more variables: distance <dbl>, tuition <dbl>, education <dbl>,
## #   income <chr>, region <chr>
library(dplyr)
# Mean composite test score for every gender x ethnicity combination.
# `means` supplies the per-facet "Mean:" labels in the plot further down.
# `.groups = "drop"` returns an ungrouped result explicitly, so summarise()
# no longer has to guess the output grouping (and no longer prints a message
# about it).
means <- data.frame(
  collegeDistance %>%
    group_by(gender, ethnicity) %>%
    summarise(mean_score = mean(score), .groups = "drop")
)

Composite Test Scores and Gender

# Boxplots of composite score by gender, one facet per ethnicity, with a
# translucent violin overlaid to show the shape of each distribution and a
# text label giving the group mean taken from `means`.
# Note: `%>%` binds tighter than `+`, so the ggformula pipeline is built first
# and the ggplot2 layers are then added to its result.
gf_boxplot(score ~ gender, data = collegeDistance, fill = ~gender, alpha = 0.75) %>%
  gf_facet_wrap(~ethnicity, nrow = 3, ncol = 2) %>%
  gf_labs(
    title = "Gender and Ethnicity Correlate to Different Average Test Scores",
    x = "Gender",
    y = "Scores"
  ) +
  geom_violin(alpha = 0.4, color = "grey30") +
  # `stroke` is not a geom_text parameter (it was ignored with a warning),
  # so it has been removed; the rendered labels are unchanged.
  geom_text(
    data = means,
    aes(
      x = gender,
      y = mean_score,
      label = sprintf("Mean:\n%.2f", mean_score),
      color = gender
    ),
    position = position_dodge(width = 0.8),
    vjust = -3,
    hjust = 2,
    size = 3
  )

China Income Data Import

China Income Dataset

# Import the China Income dataset from the local CSV folder.
ChinaIncome <- read_csv("~/CSVs/ChinaIncome.csv")
## Rows: 37 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): year, agricultureIncome, commerceIncome, constructionIncome, indust...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows to confirm the import and the parsed column types.
head(ChinaIncome)
## # A tibble: 6 × 6
##    year agricultureIncome commerceIncome constructionIncome industryIncome
##   <dbl>             <dbl>          <dbl>              <dbl>          <dbl>
## 1  1952              100            100                100            100 
## 2  1953              102.           133                138.           134.
## 3  1954              103.           136.               133.           159.
## 4  1955              112.           138.               152.           169.
## 5  1956              116.           147.               262.           219.
## 6  1957              120.           147.               243.           244.
## # … with 1 more variable: transportIncome <dbl>

Construction Income over Time

# Split the series at a construction income of 1000 so that each side can be
# drawn as its own line segment.
const_income_above_1000 <- subset(ChinaIncome, constructionIncome > 1000)
const_income_below_1000 <- subset(ChinaIncome, constructionIncome <= 1000)

# Colours keyed by the result of the test `constructionIncome > 1000`.
const_income_color_sceme <- c("TRUE" = "cyan3", "FALSE" = "coral1")

# Points are coloured by the threshold test; the two line layers reuse the
# same colours so every segment matches its points.
gf_point(
  constructionIncome ~ year,
  data = ChinaIncome,
  color = ~ (constructionIncome > 1000)
) %>%
  gf_line(
    constructionIncome ~ year,
    data = const_income_above_1000,
    color = const_income_color_sceme["TRUE"]
  ) %>%
  gf_line(
    constructionIncome ~ year,
    data = const_income_below_1000,
    color = const_income_color_sceme["FALSE"]
  ) %>%
  gf_labs(
    title = "Construction Income Shifts to a Linear Trend After 1982",
    y = "Construction Income",
    x = "Year"
  ) %>%
  gf_theme(legend.position = "none") +
  # Map the TRUE/FALSE colour groups onto the colours defined above.
  scale_color_manual(values = const_income_color_sceme) +
  # Title for the colour legend; it is not displayed while legend.position
  # is "none".
  labs(color = "Construction Income")

Industry Income and Agriculture Income

# Line colours for the two sectors; reused for the matching text labels.
industry <- "blue"
agriculture <- "darkgreen"

# Layer the industry and agriculture income series over the same year axis,
# then label each line directly in its own colour instead of using a legend.
# hjust = 1.2 shifts each label to the left of its x = 1988 anchor.
# (Wish the label font could be thinner or more spread out.)
ind_agr_plot <- gf_line(industryIncome ~ year, data = ChinaIncome, color = industry) %>%
  gf_line(agricultureIncome ~ year, data = ChinaIncome, color = agriculture) %>%
  gf_text(x = 1988, y = 4750, label = "industry", color = industry, hjust = 1.2) %>%
  gf_text(x = 1988, y = 500, color = agriculture, label = "agriculture", hjust = 1.2) %>%
  gf_labs(
    # Title typo fixed: "Surpases" -> "Surpasses".
    title = "Industry Income Surpasses Agricultural Income in the 20th Century",
    x = "Year",
    y = "Income"
  )

# Static rendering of the plot.
ind_agr_plot

# Interactive rendering of the same plot.
ggplotly(ind_agr_plot)